library("tidyverse")
## Warning: пакет 'tidyverse' был собран под R версии 4.2.3
## Warning: пакет 'ggplot2' был собран под R версии 4.2.3
## Warning: пакет 'tibble' был собран под R версии 4.2.3
## Warning: пакет 'tidyr' был собран под R версии 4.2.3
## Warning: пакет 'readr' был собран под R версии 4.2.3
## Warning: пакет 'purrr' был собран под R версии 4.2.3
## Warning: пакет 'dplyr' был собран под R версии 4.2.3
## Warning: пакет 'stringr' был собран под R версии 4.2.3
## Warning: пакет 'forcats' был собран под R версии 4.2.3
## Warning: пакет 'lubridate' был собран под R версии 4.2.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.3     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.4     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.0
## ✔ purrr     1.0.1     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Load the film table, then turn the release dates from text into Date values.
movieset <- read.csv(file = "Top_rated_movies1.csv")
movieset[["release_date"]] <- as.Date(movieset[["release_date"]])

Изучаем датасет:

# Show the full table as a paged HTML widget.
movieset |> rmarkdown::paged_table()

Задачи:

# Number of films released on each individual date.
MSDate <- summarise(group_by(movieset, release_date), movie_number = n())

# Scatter plot of the per-date release counts over time.
ggplot(data = MSDate, mapping = aes(release_date, movie_number)) +
  geom_point() +
  theme_bw()

Чем-то похоже на параболу.

# Quartiles, mean and extremes of the release dates.
movieset$release_date |> summary()
##         Min.      1st Qu.       Median         Mean      3rd Qu.         Max. 
## "1902-04-17" "1998-01-30" "2009-08-03" "2004-08-20" "2016-08-02" "2023-11-23"

Исследуем зависимость количественных атрибутов

# Map release years to the label of the period bucket they fall in.
#
# Buckets:
#   year <= 1950        -> "1902-1950"
#   1951 .. 1999        -> 25-year buckets ("1950-1975", "1975-2000")
#   2000 .. 2019        -> 5-year buckets  ("2000-2005", ..., "2015-2020")
#   2020 and later      -> "<start of 5-year bucket>-2023"
#
# Args:
#   year: numeric vector of calendar years (a single year works as before).
#
# Returns:
#   A character vector of the same length as `year`. A missing year yields
#   NA instead of raising an error, and an empty input yields character(0).
#
# NOTE(review): the last bucket's upper bound is the fixed text "2023", so
# years from 2025 on would get labels such as "2025-2023".
range_year <- function(year) {
  quarter_start <- year - year %% 25
  lustrum_start <- year - year %% 5

  labels <- ifelse(
    year <= 1950,
    "1902-1950",
    ifelse(
      year < 2000,
      paste(quarter_start, quarter_start + 25, sep = "-"),
      ifelse(
        year < 2020,
        paste(lustrum_start, lustrum_start + 5, sep = "-"),
        paste(lustrum_start, "2023", sep = "-")
      )
    )
  )

  # ifelse() returns a logical vector when every test is NA or the input is
  # empty; coerce so the result is always character.
  as.character(labels)
}
# Add a vote-weighted score and the calendar parts of the release date.
# `score` is the sum of vote_average * vote_count within each
# (title, popularity) group, so rows sharing both values share one total.
# The grouping is dropped right after the score is computed so that it does
# not leak into MSPop, MSRange and everything derived from them.
MSPop <- movieset |>
  group_by(title, popularity) |>
  mutate(score = sum(vote_average * vote_count)) |>
  ungroup() |>
  mutate(
    year = year(release_date),
    month = month(release_date),
    day = day(release_date)
  )

# Label every film with its release-period bucket. vapply() guarantees a
# character column whatever the number of rows.
MSRange <- MSPop |>
  mutate(year.range = vapply(year, range_year, character(1)))

# Number of films per period bucket.
table(MSRange$year.range)
## 
## 1902-1950 1950-1975 1975-2000 2000-2005 2005-2010 2010-2015 2015-2020 2020-2023 
##       144       479      1837       865      1218      1533      1833       922
# Distribution of release years across all films.
MSRange$year |> summary()
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    1902    1998    2009    2004    2016    2023

Производство фильмов по годам

# Number of films released in each calendar year.
MSYear <- summarise(group_by(MSPop, year), year.number = n())

# Yearly counts on a log10 scale.
ggplot(data = MSYear, mapping = aes(year, log10(year.number))) +
  geom_line() +
  theme_classic()

# The same series on the raw scale.
ggplot(data = MSYear, mapping = aes(year, year.number)) +
  geom_line() +
  theme_classic()

MSYear |> rmarkdown::paged_table()
MSYear$year |> summary()
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    1902    1943    1970    1969    1996    2023
# Distribution of the per-year film counts.
MSYear$year.number |> summary()
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    1.00    6.75   24.00   81.77  117.00  391.00

По месяцам:

# Number of films released in each calendar month, pooled over all years.
MSMonth <- summarise(group_by(MSPop, month), month.number = n())

# Bar chart of the monthly counts, labelled with English month names.
barplot(
  height = MSMonth$month.number,
  names.arg = month.name[MSMonth$month],
  main = "Movies produced from 1902 to 2023\nSelected by month",
  xlab = "Month",
  ylab = "Movies produced"
)

MSMonth$month.number |> summary()
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   533.0   645.5   702.5   735.9   801.0   991.0

По дням:

# Number of films released on each day of the month, pooled over all years.
MSDay <- MSPop |>
  group_by(day) |>
  summarise(day.number = n())

# barplot() is base graphics, so no ggplot2 theme is attached to it: the
# former `+ theme_classic()` was applied to barplot()'s return value after
# the bars were drawn and only produced a stray NULL in the output.
barplot(
  MSDay$day.number,
  names.arg = MSDay$day,
  xlab = "Day",
  ylab = "Movies produced",
  main = "Movies produced from 1902 to 2023\nSelected by day"
)

summary(MSDay$day.number)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   136.0   255.0   292.0   284.9   317.0   347.0

Самый продуктивный год/месяц/день:

# Year with the largest number of releases.
with(MSYear, year[which.max(year.number)])
## [1] 2018
# Month with the largest number of releases, as an English month name.
with(MSMonth, month.name[month[which.max(month.number)]])
## [1] "September"
# Day of the month with the largest number of releases.
with(MSDay, day[which.max(day.number)])
## [1] 12
# Distribution of the vote-weighted score.
MSPop$score |> summary()
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       4    3032    5569   13825   13131  291052
library("GGally")
## Warning: пакет 'GGally' был собран под R версии 4.2.3
## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2
# Draw a GGally scatterplot matrix of the four rating metrics (score,
# vote_average, vote_count, popularity) plus one extra column, coloured by
# release-period bucket. Replaces five copy-pasted ggpairs() calls.
#
# Args:
#   data:         data frame holding the metric columns, `year.range` and
#                 `extra_column`.
#   extra_column: name of the additional column to include in the matrix.
#   log_columns:  names of the columns to replace by their log10 before
#                 plotting (popularity only, by default).
#
# Returns:
#   The ggmatrix object built by ggpairs(); it is drawn when printed.
#
# NOTE(review): alpha = 0.5 stays inside aes() exactly as in the original
# calls, so it is treated as a mapped aesthetic rather than a fixed setting.
plot_metric_pairs <- function(data, extra_column, log_columns = "popularity") {
  metric_columns <- c("score", "vote_average", "vote_count", "popularity")

  # Work on a plain data frame so the column replacement below behaves the
  # same whether or not `data` is a grouped tibble.
  plot_data <- as.data.frame(data)
  plot_data[log_columns] <- lapply(plot_data[log_columns], log10)

  ggpairs(
    plot_data,
    columns = which(names(plot_data) %in% c(metric_columns, extra_column)),
    mapping = aes(alpha = 0.5, col = year.range)
  )
}

plot_metric_pairs(MSRange, "release_date")

plot_metric_pairs(MSRange, "year")

plot_metric_pairs(MSRange, "month")

# For the day-of-month view the two vote columns are log-scaled as well.
plot_metric_pairs(
  MSRange, "day",
  log_columns = c("popularity", "vote_average", "vote_count")
)

plot_metric_pairs(MSRange, "id")

# Ten most popular films of the "1902-1950" bucket. as.data.frame() yields a
# plain data frame, so the ranking is done over the whole subset and the
# result prints with base formatting.
MSRange |>
  filter(year.range %in% "1902-1950") |>
  as.data.frame() |>
  slice_max(popularity, n = 10)
##       id                           title
## 1  11224                      Cinderella
## 2  11360                           Dumbo
## 3    408 Snow White and the Seven Dwarfs
## 4    630                The Wizard of Oz
## 5  10895                       Pinocchio
## 6   3170                           Bambi
## 7   1585           It's a Wonderful Life
## 8    770              Gone with the Wind
## 9  11881          Miracle on 34th Street
## 10    15                    Citizen Kane
##                                                                                                                                                                                                                                                                                                                                                                                        overview
## 1  Cinderella has faith her dreams of a better life will come true. With help from her loyal mice friends and a wave of her Fairy Godmother's wand, Cinderella's rags are magically turned into a glorious gown and off she goes to the Royal Ball. But when the clock strikes midnight, the spell is broken, leaving a single glass slipper... the only key to the ultimate fairy-tale ending!
## 2                                                                                                                                                                              Dumbo is a baby elephant born with over-sized ears and a supreme lack of confidence. But thanks to his even more diminutive buddy Timothy the Mouse,  the pint-sized pachyderm learns to surmount all obstacles.
## 3                                                                                                                              A beautiful girl, Snow White, takes refuge in the forest in the house of seven dwarfs to hide from her stepmother, the wicked Queen. The Queen is jealous because she wants to be known as "the fairest in the land," and Snow White's beauty surpasses her own.
## 4                                                                           Young Dorothy finds herself in a magical world where she makes friends with a lion, a scarecrow and a tin man as they make their way along the yellow brick road to talk with the Wizard and ask for the things they miss most in their lives. The Wicked Witch of the West is the only thing that could stop them.
## 5                When loving Geppetto creates a wooden puppet, his wish is granted when it comes to life as a little wooden boy named Pinocchio. With his faithful friend and conscience Jiminy Cricket by his side, Pinocchio, embarks on fantastic adventures that his bravery, loyalty and honesty until triumphs in his triumphs in his quest for his heart's desire: to become a real boy.
## 6                                                                                                                                                                                                                                                                            Bambi's tale unfolds from season to season as the young prince of the forest learns about life, love, and friends.
## 7                                                           A holiday favourite for generations...  George Bailey has spent his entire life giving to the people of Bedford Falls.  All that prevents rich skinflint Mr. Potter from taking over the entire town is George's modest building and loan company.  But on Christmas Eve the business's $8,000 is lost and George's troubles begin.
## 8                                                                                                                                                                                                                               The spoiled daughter of a Georgia plantation owner conducts a tumultuous romance with a cynical profiteer during the American Civil War and Reconstruction Era.
## 9                                                                                                    Kris Kringle, seemingly the embodiment of Santa Claus, is asked to portray the jolly old fellow at Macy's following his performance in the Thanksgiving Day parade. His portrayal is so complete that many begin to question if he truly is Santa Claus, while others question his sanity.
## 10                                                                                               Newspaper magnate, Charles Foster Kane is taken from his mother as a boy and made the ward of a rich industrialist. As a result, every well-meaning, tyrannical or self-destructive move he makes for the rest of his life appears in some way to be a reaction to that deeply wounding event.
##    popularity release_date vote_average vote_count     score year month day
## 1      87.122   1950-02-22        7.042       6298 44350.516 1950     2  22
## 2      80.097   1941-10-31        6.997       4634 32424.098 1941    10  31
## 3      72.349   1937-12-21        7.122       6912 49227.264 1937    12  21
## 4      70.664   1939-08-15        7.575       5167 39140.025 1939     8  15
## 5      66.270   1940-02-23        7.103       5483 38945.749 1940     2  23
## 6      66.096   1942-08-14        7.009       5284 37035.556 1942     8  14
## 7      63.569   1946-12-20        8.263       3905 32267.015 1946    12  20
## 8      49.256   1939-12-15        7.968       3739 29792.352 1939    12  15
## 9      43.762   1947-06-04        7.336        648  4753.728 1947     6   4
## 10     40.691   1941-04-17        8.016       5102 40897.632 1941     4  17
##    year.range
## 1   1902-1950
## 2   1902-1950
## 3   1902-1950
## 4   1902-1950
## 5   1902-1950
## 6   1902-1950
## 7   1902-1950
## 8   1902-1950
## 9   1902-1950
## 10  1902-1950
# Ten least popular films of the "1902-1950" bucket, ranked over the whole
# subset after conversion to a plain data frame.
MSRange |>
  filter(year.range %in% "1902-1950") |>
  as.data.frame() |>
  slice_min(popularity, n = 10)
##       id                                            title
## 1  27040                          Meshes of the Afternoon
## 2  37719                             A Night at the Opera
## 3  10728                                            Faust
## 4   3059 Intolerance: Love's Struggle Throughout the Ages
## 5  28978                                       The Circus
## 6    776                            The Rules of the Game
## 7  30588                                 Monsieur Verdoux
## 8   3086                                     The Lady Eve
## 9    212                             Arsenic and Old Lace
## 10 17057                                In a Lonely Place
##                                                                                                                                                                                                                                                                                                                                                                                                                                                                            overview
## 1                                                                                                                                                                                                                                      A woman returning home falls asleep and has vivid dreams that may or may not be happening in reality. Through repetitive images and complete mismatching of the objective view of time and space, her dark inner desires play out on-screen.
## 2                                                                                                                                                                                                                                         The Marx Brothers take on high society and the opera world to bring two lovers together. A sly business manager and two wacky friends of two opera singers help them achieve success while humiliating their stuffy and snobbish enemies.
## 3                                                                                                                                                                                                                                                                                                                                                               God and Satan war over earth; to settle things, they wager on the soul of Faust, a learned and prayerful alchemist.
## 4                                                                                                                                                                                                                                                                                                                               The story of a poor young woman, separated by prejudice from her husband and baby, is interwoven with tales of intolerance from throughout history.
## 5                                                                                                                                                                                                                                                                                                                                       Charlie, a wandering tramp, becomes a circus handyman - soon the star of the show - and falls in love with the circus owner's stepdaughter.
## 6                                                                                                                                                                                                                                                                                                                                                                 A weekend at a marquis’ country château lays bare some ugly truths about a group of haut bourgeois acquaintances.
## 7                                                                                                     The film is about an unemployed banker, Henri Verdoux, and his sociopathic methods of attaining income. While being both loyal and competent in his work, Verdoux has been laid-off. To make money for his wife and child, he marries wealthy widows and then murders them. His crime spree eventually works against him when two particular widows break his normal routine.
## 8  It's no accident when wealthy Charles falls for Jean. Jean is a con artist with her sights set on Charles' fortune. Matters complicate when Jean starts falling for her mark. When Charles suspects Jean is a gold digger, he dumps her. Jean, fixated on revenge and still pining for the millionaire, devises a plan to get back in Charles' life. With love and payback on her mind, she re-introduces herself to Charles, this time as an aristocrat named Lady Eve Sidwich.
## 9                                                                                                                                                                  Mortimer Brewster, a newspaper drama critic, playwright, and author known for his diatribes against marriage, suddenly falls in love and gets married;  but when he makes a quick trip home to tell his two maiden aunts, he finds out his aunts' hobby - killing lonely old men and burying them in the cellar!
## 10                                                                                                                                                                                                                                                                                                                                                                      An aspiring actress begins to suspect that her temperamental and mentally impaired boyfriend is a murderer.
##    popularity release_date vote_average vote_count    score year month day
## 1       8.144   1943-01-01        7.680        339 2603.520 1943     1   1
## 2       8.586   1935-11-15        7.438        442 3287.596 1935    11  15
## 3       9.864   1926-10-13        7.911        310 2452.410 1926    10  13
## 4      10.373   1916-09-04        7.103        306 2173.518 1916     9   4
## 5      10.429   1928-01-06        8.005        713 5707.565 1928     1   6
## 6      10.454   1939-07-09        7.592        520 3947.840 1939     7   9
## 7      10.632   1947-09-26        7.782        379 2949.378 1947     9  26
## 8      11.183   1941-02-25        7.217        307 2215.619 1941     2  25
## 9      11.212   1944-09-01        7.625        849 6473.625 1944     9   1
## 10     11.245   1950-05-17        7.548        506 3819.288 1950     5  17
##    year.range
## 1   1902-1950
## 2   1902-1950
## 3   1902-1950
## 4   1902-1950
## 5   1902-1950
## 6   1902-1950
## 7   1902-1950
## 8   1902-1950
## 9   1902-1950
## 10  1902-1950

По названиям фильмов: в начале часто встречаются слова вроде “The” и “A”. Интересно, сколько названий начинается с них.

library("stringr")
# First word of every title.
MSTitle <- mutate(movieset, firstWord = word(title, 1))

# How often each first word occurs and its share of all titles,
# most frequent first.
MSTitle |>
  count(firstWord, name = "total.fw") |>
  mutate(ratio = total.fw / nrow(MSTitle)) |>
  as.data.frame() |>
  arrange(desc(total.fw)) |>
  rmarkdown::paged_table()

Такая вот интересная статистика

# All films whose title starts with the word "American".
MSTitle |>
  filter(firstWord %in% "American") |>
  as.data.frame() |>
  rmarkdown::paged_table()

А что насчёт самых длинных названий?

# Length of every title in characters and in space-separated words.
# nchar() and lengths(strsplit()) are vectorised, so the new columns are
# plain integer vectors; the former sapply() calls returned vectors named by
# every title and were not type-stable on empty input.
MSTitle <- MSTitle |>
  mutate(
    title.length = nchar(title),
    title.word.length = lengths(strsplit(title, " "))
  )
# How many titles have each word count, most common first.
MSTitle |>
  count(title.word.length, name = "Number") |>
  as.data.frame() |>
  arrange(desc(Number)) |>
  rmarkdown::paged_table()
# How many titles have each character count, most common first.
MSTitle |>
  count(title.length, name = "Number") |>
  as.data.frame() |>
  arrange(desc(Number)) |>
  rmarkdown::paged_table()
# The (up to) 50 rarest word counts, i.e. those shared by the fewest titles.
MSMinWL <- MSTitle |>
  count(title.word.length, name = "Number") |>
  as.data.frame() |>
  slice_min(Number, n = 50)

# The (up to) 50 rarest character counts.
MSMinL <- MSTitle |>
  count(title.length, name = "Number") |>
  as.data.frame() |>
  slice_min(Number, n = 50)
# Films whose word count is among the rarest ones, longest titles first.
MSTitle |>
  filter(title.word.length %in% MSMinWL$title.word.length) |>
  as.data.frame() |>
  arrange(desc(title.word.length)) |>
  rmarkdown::paged_table()
# Films whose character count is among the rarest ones, longest titles first.
MSTitle |>
  filter(title.length %in% MSMinL$title.length) |>
  as.data.frame() |>
  arrange(desc(title.length)) |>
  rmarkdown::paged_table()
# Films with titles of at most 10 characters, shortest first.
MSTitle |>
  filter(title.length <= 10) |>
  as.data.frame() |>
  arrange(title.length) |>
  rmarkdown::paged_table()